{ TOKEN.PAS - Pascal source from sp12src.zip, 1990-12-18 }
{$A+,B-,D-,E-,F+,I+,L-,N-,O-,R-,S-,V-}
Unit Token;
Interface
Uses Objects;
Const
TokenStringSize = 35; { Maximum size of a string to be tokenized }
TokenEntryListSize = 20; { Number of tokens per hash entry block }
HashTableSize = 211; { Size of hash table }
Type
TokenStringPtr = ^TokenString;
TokenString = String[TokenStringSize];
TokenTextTablePtr = ^TokenTextTableType;
TokenTextTableType = Record
TokenTextEntry : Array[0..255] Of TokenStringPtr; { Index by Lo(Token) }
End;
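{ A token is a 16-bit value: Hi(Token) selects one of up to 256 text blocks
and Lo(Token) selects the entry within that block, so up to 65535 strings
can be tokenized; token 0 means "not in the table". }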
HashEntryPtr = ^HashEntryType;
HashEntryType = Record
{ A block of tokens whose strings all hash to the same value }
EntChain : HashEntryPtr; { Blocks chained onto TokTable }
EntTokenCount : Word; { Number of tokens in this block }
EntToken : Array[1..TokenEntryListSize] Of Word;
End;
PToken = ^TToken;
TToken = Object(TObject)
TokMaxToken : Word; { Maximum current token }
Constructor Init;
{ Initialize hash table }
Constructor RestoreHashTable(FileName : String);
{ Restore hash table from named file }
Function TokenText(Token : Word) : TokenString;
{ Return text given token, or null string if token not in table. }
Function TokenInsertText(St : TokenString) : Word;
{ Enter string in hash table if not duplicate; return token }
Procedure TokenUpdateText(Token : Word; St : TokenString);
{ Update the text associated with a token }
Function TextToken(St : TokenString) : Word;
{ Locate text in hash table; return token, or 0 if not found }
Function TokenAddress(Token : Word) : TokenStringPtr;
{ Return address of string represented by Token (no checking) }
Procedure SaveHashTable(FileName : String);
{ Save hash table to named file }
Destructor Done; Virtual;
{ Releases all storage associated with hash table }
Procedure EditMatch(Count : Byte; Var MatchTable;
St : TokenString; TotalMatch : Boolean);
{ Return the tokens of the strings that most nearly match St, as
measured by EditDistance. Count specifies the maximum number of
tokens to return. MatchTable is an array of at least Count
Words. Tokens are returned in order of smallest to largest
EditDistance. If TotalMatch is TRUE, all strings are examined;
otherwise only strings whose first letter matches the first or
second letter of St are examined (saves time). }
Function HashListLength(Bucket : Word) : Word;
{ Return the number of entries in the indicated hash bucket entry chain. This
function is for performance analysis purposes only }
Private
TokTextTable : Array[0..255] Of TokenTextTablePtr; { Index by Hi(Token) }
TokHashTable : Array[0..HashTableSize-1] Of HashEntryPtr;
Procedure InsertHashEntry(Token, Hash : Word);
{ Insert Token in hash table chain }
Procedure InsertTextEntry(Token : Word; Var St : TokenString);
{ Insert word string in word table }
Function LocateString(Var St : TokenString; Hash : Word) : Word;
{ Locate text in hash table; return token, or 0 if not found }
End;
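(*
A minimal usage sketch, assuming the standard Objects-unit New/Dispose
idiom; the program fragment and variable names below are illustrative
only and are not part of this unit:

  Var
    Dict : PToken;
    Tok  : Word;
  Begin
    Dict := New(PToken, Init);
    Tok := Dict^.TokenInsertText('hello');  { insert, or return the existing token }
    WriteLn(Dict^.TokenText(Tok));          { prints 'hello' }
    WriteLn(Dict^.TextToken('absent'));     { prints 0: not in the table }
    Dispose(Dict, Done);
  End.
*)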
Implementation
Uses EditDist, PairHeap;
Const
TextBufSize = 16384; { Size of text buffer for Save/Restore }
SaveMagicNumber = $EF120550; { Magic number for save/restore }
Type
MatchRecordPtr = ^MatchRecord;
MatchRecord = Object(HeapEntry)
{ Used by EditMatch to sort tokens }
Token : Word;
Distance : Word;
End;
HeapControl = Object(TopSoMany)
Function Less(Var x, y : HeapEntry) : Boolean; Virtual;
End;
Function HashPJW(Var s : TokenString) : Word;
{ Hash string to a number between 0 and HashTableSize-1 }
Function HashPJWPrim(Var s : TokenString) : LongInt;
Inline(
$5E/ { pop si}
$58/ { pop ax}
$1E/ { push ds}
$8E/$D8/ { mov ds,ax}
$31/$DB/ { xor bx,bx}
$31/$D2/ { xor dx,dx}
$AC/ { lodsb}
$30/$E4/ { xor ah,ah}
$89/$C1/ { mov cx,ax}
$E3/$2C/ { jcxz done}
{next:}
$D1/$E3/ { shl bx,1}
$D1/$D2/ { rcl dx,1}
$D1/$E3/ { shl bx,1}
$D1/$D2/ { rcl dx,1}
$D1/$E3/ { shl bx,1}
$D1/$D2/ { rcl dx,1}
$D1/$E3/ { shl bx,1}
$D1/$D2/ { rcl dx,1}
$AC/ { lodsb}
$01/$C3/ { add bx,ax}
$83/$D2/$00/ { adc dx,0}
$F6/$C6/$F0/ { test dh,$F0}
$74/$0F/ { jz skip}
$88/$F0/ { mov al,dh}
$80/$E6/$0F/ { and dh,$0F}
$D0/$E8/ { shr al,1}
$D0/$E8/ { shr al,1}
$D0/$E8/ { shr al,1}
$D0/$E8/ { shr al,1}
$30/$C3/ { xor bl,al}
{skip:}
$E2/$D4/ { loop next}
{done:}
$1F/ { pop ds}
$89/$D8); { mov ax,bx}
Begin
HashPJW := HashPJWPrim(s) Mod HashTableSize;
End;
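(*
For reference, an untested Pascal sketch of what the inline routine above
computes (reconstructed from the opcodes; a PJW-style hash): for each
character ch of s, with the 32-bit value h starting at 0,

  h := (h Shl 4) + Ord(ch);
  g := h And $F0000000;
  If g <> 0 Then
    h := (h Xor (g Shr 28)) And $0FFFFFFF;

HashPJW then returns h Mod HashTableSize, a bucket index in the range
0..HashTableSize-1.
*)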
Constructor TToken.Init;
{ Initialize control pointers }
Begin
If Not TObject.Init Then Fail;
TokMaxToken := 0;
FillChar(TokTextTable, SizeOf(TokTextTable), 0);
FillChar(TokHashTable, SizeOf(TokHashTable), 0);
End;
Procedure TToken.InsertHashEntry(Token, Hash : Word);
Var
Entry : HashEntryPtr;
Begin
Entry := TokHashTable[Hash];
If (Entry = Nil) Or (Entry^.EntTokenCount >= TokenEntryListSize) Then
Begin
New(Entry);
If Entry <> Nil Then With Entry^ Do Begin
EntChain := TokHashTable[Hash];
EntTokenCount := 0;
TokHashTable[Hash] := Entry;
End;
End;
If Entry <> Nil Then With Entry^ Do Begin
Inc(EntTokenCount);
EntToken[EntTokenCount] := Token;
End;
End;
Procedure TToken.InsertTextEntry(Token : Word; Var St : TokenString);
{ Insert word string in word table }
Var
j : Word;
Begin
j := Hi(Token);
If TokTextTable[j] = Nil Then Begin
New(TokTextTable[j]);
If TokTextTable[j] <> Nil Then With TokTextTable[j]^ Do Begin
FillChar(TokenTextEntry, SizeOf(TokenTextEntry), 0);
End;
End;
If TokTextTable[j] <> Nil Then With TokTextTable[j]^ Do Begin
j := Lo(Token);
If TokenTextEntry[j] <> Nil Then
FreeMem(TokenTextEntry[j], Succ(Length(TokenTextEntry[j]^)));
GetMem(TokenTextEntry[j], Succ(Length(St)));
If TokenTextEntry[j] <> Nil Then TokenTextEntry[j]^ := St;
End;
End;
Function TToken.LocateString(Var St : TokenString; Hash : Word) : Word;
{ Locate text in hash table; return token, or 0 if not found }
Var
Entry, Trail : HashEntryPtr;
i, Token : Word;
Found : Boolean;
Begin
LocateString := 0;
Entry := TokHashTable[Hash];
Trail := Nil;
Found := False;
While Entry <> Nil Do With Entry^ Do Begin
i := 1;
Repeat
Token := EntToken[i];
Inc(i);
Found := TokTextTable[Hi(Token)]^.TokenTextEntry[Lo(Token)]^ = St;
Until Found Or (i > EntTokenCount);
If Found Then Begin
LocateString := Token;
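{ Self-organizing heuristic: move the found token one slot toward the
front of its bucket (or into the previous block of the chain when it is
already first in this block), so frequently looked-up strings are found
sooner on later searches. }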
Dec(i, 2);
If i > 0 Then Begin
EntToken[Succ(i)] := EntToken[i];
EntToken[i] := Token;
End Else If Trail <> Nil Then Begin
i := Trail^.EntTokenCount;
EntToken[1] := Trail^.EntToken[i];
Trail^.EntToken[i] := Token;
End;
Entry := Nil;
End Else Begin
Trail := Entry;
Entry := EntChain;
End;
End;
End;
Function TToken.TokenText(Token : Word) : TokenString;
Begin
TokenText := '';
If TokTextTable[Hi(Token)] <> Nil Then With TokTextTable[Hi(Token)]^ Do
If TokenTextEntry[Lo(Token)] <> Nil Then
TokenText := TokenTextEntry[Lo(Token)]^
End;
Function TToken.TokenInsertText(St : TokenString) : Word;
Var
h, j : Word;
Begin
h := HashPJW(St);
j := LocateString(St, h);
If j = 0 Then Begin
If TokMaxToken < $FFFF Then Begin
Inc(TokMaxToken);
j := TokMaxToken;
InsertTextEntry(j, St);
InsertHashEntry(j, h);
End;
End;
TokenInsertText := j;
End;
Procedure TToken.TokenUpdateText(Token : Word; St : TokenString);
Var
h : Word;
Begin
InsertTextEntry(Token, St);
h := HashPJW(St);
If LocateString(St, h) = 0 Then
InsertHashEntry(Token, h);
If TokMaxToken < Token Then TokMaxToken := Token;
End;
Function TToken.TextToken(St : TokenString) : Word;
Begin
TextToken := LocateString(St, HashPJW(St));
End;
Function TToken.TokenAddress(Token : Word) : TokenStringPtr;
Begin
TokenAddress := TokTextTable[Hi(Token)]^.TokenTextEntry[Lo(Token)];
End;
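{ Save/Restore file format: a text file whose first line is SaveMagicNumber
and whose following lines hold the strings for tokens 1..TokMaxToken, one
per line. RestoreHashTable re-enters them in the same order, so token
numbers are preserved across a save/restore cycle. }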
Procedure TToken.SaveHashTable(FileName : String);
Type
TextBuffer = Array[1..TextBufSize] Of Char;
Var
Buf : ^TextBuffer;
f : Text;
i : Word;
Begin
Assign(f, FileName);
New(Buf);
If Buf <> Nil Then SetTextBuf(f, Buf^, TextBufSize);
ReWrite(f);
WriteLn(f, SaveMagicNumber);
For i := 1 To TokMaxToken Do
WriteLn(f, TokenAddress(i)^);
Close(f);
Dispose(Buf);
End;
Constructor TToken.RestoreHashTable(FileName : String);
Type
TextBuffer = Array[1..TextBufSize] Of Char;
Var
Buf : ^TextBuffer;
n : LongInt;
i : Word;
f : Text;
st : TokenString;
ch : Char;
Begin
TokMaxToken := 0;
FillChar(TokTextTable, SizeOf(TokTextTable), 0);
FillChar(TokHashTable, SizeOf(TokHashTable), 0);
{$I-}
Assign(f, FileName);
New(Buf);
If Buf <> Nil Then SetTextBuf(f, Buf^, TextBufSize);
Reset(f);
{$I+}
If IoResult = 0 Then Begin
ReadLn(f, n);
If n = SaveMagicNumber Then Begin
i := 1;
While Not Eof(f) Do Begin
ReadLn(f, st);
TokenUpdateText(i, st);
Inc(i);
End;
End;
Close(f);
End;
Dispose(Buf);
End;
Destructor TToken.Done;
Var
i, j : Byte;
Entry, Temp : HashEntryPtr;
Begin
For i := 0 To 255 Do If TokTextTable[i] <> Nil Then
With TokTextTable[i]^ Do Begin
For j := 0 To 255 Do If TokenTextEntry[j] <> Nil Then
FreeMem(TokenTextEntry[j], Succ(Length(TokenTextEntry[j]^)));
Dispose(TokTextTable[i]);
End;
For i := 0 To Pred(HashTableSize) Do Begin
Entry := TokHashTable[i];
While Entry <> Nil Do Begin
Temp := Entry^.EntChain;
Dispose(Entry);
Entry := Temp;
End;
End;
End;
Function HeapControl.Less(Var x, y : HeapEntry) : Boolean;
Var
xx : MatchRecord Absolute x;
yy : MatchRecord Absolute y;
Begin
Less := xx.Distance > yy.Distance;
End;
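{ Note: the ordering is deliberately inverted (a larger Distance compares
as "less"), presumably so that the bounded TopSoMany heap used by EditMatch
keeps the Count smallest distances and discards the current worst match
first. This is an inference from the usage below; PairHeap itself is not
shown here. }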
Procedure TToken.EditMatch(Count : Byte; Var MatchTable;
St : TokenString; TotalMatch : Boolean);
Var
Heap : HeapControl;
Rec : MatchRecordPtr;
i, j, Dist : Word;
Match : Array[1..255] Of Word Absolute MatchTable;
Begin
Heap.Init(Count);
With Heap Do Begin
For i := 1 To TokMaxToken Do Begin
If TotalMatch
Or (TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^[1] = St[1])
Or (TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^[1] = St[2])
Then Begin
Dist := EditDistance(St,
TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^);
Rec := GetDiscard;
If Rec = Nil Then New(Rec);
If Rec <> Nil Then Begin
With Rec^ Do Begin
Token := i;
Distance := Dist;
End;
Insert(Rec^);
End;
End;
End;
Repeat
Rec := GetDiscard;
If Rec <> Nil Then Dispose(Rec);
Until Rec = Nil;
j := EntryCount;
End;
For i := Count DownTo 1 Do If i > j Then Match[i] := 0 Else Begin
Rec := Heap.DeleteLowEntry;
Match[i] := Rec^.Token;
Dispose(Rec);
End;
End;
Function TToken.HashListLength(Bucket : Word) : Word;
Var
Count : Word;
Entry : HashEntryPtr;
Begin
HashListLength := 0;
If Bucket < HashTableSize Then Begin
Count := 0;
Entry := TokHashTable[Bucket];
While Entry <> Nil Do With Entry^ Do Begin
Inc(Count, EntTokenCount);
Entry := EntChain;
End;
HashListLength := Count;
End;
End;
End.